Salary Trends in Data Science

5. Statistical Analysis & Hypothesis Testing

Shapiro-Wilk test

set.seed(123)  # fixed seed so the random sub-sample below is reproducible
# Subset the data by employment type (Full-time, Part-time and Contract)
full_time_data <- cleaned_salaries[cleaned_salaries$employment_type == "Full-time", ]
# shapiro.test() only accepts a sample size between 3 and 5000, so at most 5000
# Full-time rows are drawn; min() keeps sample() from failing when fewer than
# 5000 Full-time rows are available.
full_time_sample <- full_time_data[sample(nrow(full_time_data), min(5000, nrow(full_time_data))), ]
part_time_data <- cleaned_salaries[cleaned_salaries$employment_type == "Part-time", ]
contract_data <- cleaned_salaries[cleaned_salaries$employment_type == "Contract", ]

# Conduct the Shapiro-Wilk test for Full-time salaries
shapiro_test_full_time <- shapiro.test(full_time_sample$salary_in_usd)
print(shapiro_test_full_time)
## 
##  Shapiro-Wilk normality test
## 
## data:  full_time_sample$salary_in_usd
## W = 0.98061, p-value < 2.2e-16
# Conduct the Shapiro-Wilk test for Part-time salaries
# (the whole Part-time subset is tested; no sub-sampling is applied)
shapiro_test_part_time <- shapiro.test(part_time_data$salary_in_usd)
print(shapiro_test_part_time)
## 
##  Shapiro-Wilk normality test
## 
## data:  part_time_data$salary_in_usd
## W = 0.91907, p-value = 0.4491
# Conduct the Shapiro-Wilk test for Contract salaries (result is auto-printed)
shapiro.test(contract_data$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  contract_data$salary_in_usd
## W = 0.86805, p-value = 0.1172

Null Hypothesis: The salary data for Full-time employees is normally distributed. Alternate Hypothesis: The salary data for Full-time employees is not normally distributed. Results: Since the p-value is less than 2.2e-16, which is far smaller than the significance level (α = 0.05), we reject the null hypothesis. This suggests that there is enough evidence to conclude that the salary data for Full-time employees is not normally distributed.

Null Hypothesis: The salary data for Part-time employees is normally distributed. Alternate Hypothesis: The salary data for Part-time employees is not normally distributed. Results: Since the p-value is 0.4491, which is greater than the significance level (α = 0.05), we fail to reject the null hypothesis. This suggests that there is insufficient evidence to conclude that the salary data for Part-time employees is not normally distributed.

Null Hypothesis: The salary data for Contract employees is normally distributed. Alternate Hypothesis: The salary data for Contract employees is not normally distributed. Results: Since the p-value is 0.1172, which is greater than the significance level (α = 0.05), we fail to reject the null hypothesis. This suggests that there is insufficient evidence to conclude that the salary data for Contract employees is not normally distributed.

# Shapiro-Wilk normality test on salaries for the "Data Manager" job title
data_manager <- cleaned_salaries[cleaned_salaries$job_title == "Data Manager", ]
shapiro.test(data_manager$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_manager$salary_in_usd
## W = 0.97033, p-value = 0.1905
# Shapiro-Wilk normality test on salaries for the "Research Analyst" job title
research_analyst <- cleaned_salaries[cleaned_salaries$job_title == "Research Analyst", ]
shapiro.test(research_analyst$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  research_analyst$salary_in_usd
## W = 0.90298, p-value = 0.0003165
# Shapiro-Wilk normality test on salaries for the "Data Engineer" job title
data_engineer <- cleaned_salaries[cleaned_salaries$job_title == "Data Engineer", ]
shapiro.test(data_engineer$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_engineer$salary_in_usd
## W = 0.97815, p-value = 1.745e-11
# Shapiro-Wilk normality test on salaries for the "Data Scientist" job title
data_scientist <- cleaned_salaries[cleaned_salaries$job_title == "Data Scientist", ]
shapiro.test(data_scientist$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_scientist$salary_in_usd
## W = 0.98507, p-value = 1.818e-08
# Shapiro-Wilk normality test on salaries for the "Data Analyst" job title
data_analyst <- cleaned_salaries[cleaned_salaries$job_title == "Data Analyst", ]
shapiro.test(data_analyst$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_analyst$salary_in_usd
## W = 0.96718, p-value = 6.878e-12
# Shapiro-Wilk normality test on salaries for the "Analytics Engineer" job title
analytics_engineer <- cleaned_salaries[cleaned_salaries$job_title == "Analytics Engineer", ]
shapiro.test(analytics_engineer$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  analytics_engineer$salary_in_usd
## W = 0.98744, p-value = 0.07004

Null Hypothesis: The salary data for Data Manager is normally distributed. Alternate Hypothesis: The salary data for Data Manager is not normally distributed. Results: Since the p-value (0.1905) is greater than the significance level (α = 0.05), we fail to reject the null hypothesis. This suggests that there is not enough evidence to conclude that the salary data for Data Manager is not normally distributed. Failing to reject the null hypothesis does not prove normality, but it means that treating the data as approximately normal is a reasonable working assumption.

Null Hypothesis: The salary data for research analyst is normally distributed. Alternate hypothesis: The salary data for research analyst is not normally distributed. results: Since the p-value (0.0003165) is much less than the significance level (α = 0.05), we reject the null hypothesis. This indicates that there is enough evidence to conclude that the salary data for Research Analysts is not normally distributed.

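Final Result: Of the six job titles tested, only “Data Manager” (p = 0.1905) and “Analytics Engineer” (p = 0.07004) have p-values above 0.05, so only these two show no significant departure from a normal distribution. The p-values for Research Analyst (0.0003165), Data Engineer (1.745e-11), Data Scientist (1.818e-08), and Data Analyst (6.878e-12) are all below 0.05, so normality is rejected for those job titles.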

# Q-Q plots for the two job titles whose Shapiro-Wilk p-values were above 0.05;
# qqline() overlays the reference line in red.
qqnorm(data_manager$salary_in_usd, main = "Q-Q Plot for Data Manager Salaries")
qqline(data_manager$salary_in_usd, col = "red", lwd = 2)

qqnorm(analytics_engineer$salary_in_usd, main = "Q-Q Plot for Analytics Engineer Salaries")
qqline(analytics_engineer$salary_in_usd, col = "red", lwd = 2)

T-test

# Two-sample t-test with default arguments comparing the mean salaries of the
# two groups (reported below as a Welch Two Sample t-test)
t_test_result <- t.test(data_manager$salary_in_usd, analytics_engineer$salary_in_usd)

# Print the t-test result
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  data_manager$salary_in_usd and analytics_engineer$salary_in_usd
## t = -6.1742, df = 97.927, p-value = 1.513e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -51096.20 -26239.23
## sample estimates:
## mean of x mean of y 
##  116371.2  155038.9

Null Hypothesis: The average salaries of Data Managers and Analytics Engineers are equal. Alternative Hypothesis: The average salaries of Data Managers and Analytics Engineers are not equal. Results: Given that the p-value is 1.513e-08 (which is much smaller than 0.05), we reject the null hypothesis. This means that there is a statistically significant difference in the average salaries of Data Managers and Analytics Engineers, with Analytics Engineers earning more on average than Data Managers.

# Conduct shapiro test and t-test if applicable on work models column
# List the distinct work models first so each one can be tested separately
unique(cleaned_salaries$work_models)
## [1] "Remote"  "On-site" "Hybrid"
remote <- cleaned_salaries[cleaned_salaries$work_models == "Remote", ]
shapiro.test(remote$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  remote$salary_in_usd
## W = 0.98781, p-value = 7.492e-12
# Shapiro-Wilk normality test on salaries for the "On-site" work model
on_site_salaries <- cleaned_salaries[cleaned_salaries$work_models == "On-site", ]
shapiro.test(on_site_salaries$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  on_site_salaries$salary_in_usd
## W = 0.97608, p-value < 2.2e-16
# Shapiro-Wilk normality test on salaries for the "Hybrid" work model
hybrid <- cleaned_salaries[cleaned_salaries$work_models == "Hybrid", ]
shapiro.test(hybrid$salary_in_usd)
## 
##  Shapiro-Wilk normality test
## 
## data:  hybrid$salary_in_usd
## W = 0.94075, p-value = 0.03666

Null Hypothesis: For each work model (Remote, On-site, Hybrid), the salary data is normally distributed. Alternate Hypothesis: For that work model, the salary data is not normally distributed. Results: Since all three p-values (7.492e-12 for Remote, < 2.2e-16 for On-site, and 0.03666 for Hybrid) are less than 0.05, we reject the null hypothesis for each work model. None of the work models (Remote, On-site, or Hybrid) have salaries that follow a normal distribution based on the Shapiro-Wilk test results.

# Display the cleaned data set (a bare object name auto-prints it)
cleaned_salaries

Chi-Squared Test

conducting chi-squared test between employment_type and work_models

# Cross-tabulate employment type against work model and test for independence
table_data <- table(cleaned_salaries$employment_type, cleaned_salaries$work_models)
chisq_test <- chisq.test(table_data)
## Warning in chisq.test(table_data): Chi-squared approximation may be incorrect
cat("\n\n")
cat("Chi-Squared Test for Employment Type and Work Models")
## Chi-Squared Test for Employment Type and Work Models
print(chisq_test)
## 
##  Pearson's Chi-squared test
## 
## data:  table_data
## X-squared = 24.389, df = 4, p-value = 6.673e-05
# Expected cell counts under independence, shown to check for small counts
chisq_test$expected
##            
##                  Hybrid     On-site      Remote
##   Contract   0.06917756    5.534204    3.396618
##   Full-time 39.90776326 3192.621061 1959.471176
##   Part-time  0.02305919    1.844735    1.132206

Null Hypothesis: There is no association between employment type and work model (they are independent). Alternate Hypothesis: There is an association between employment type and work model (they are dependent). Results: Since the p-value(6.673e-05) is less than 0.05, we reject the null hypothesis. There is a statistically significant association between employment type and work model. However, we should be cautious about these results because, for a chi-squared test, the expected frequencies should be greater than 5. In this case, the expected counts for Hybrid: Contract, Contract:Remote, Part-time: Hybrid, Part-time: On-site, and Part-time: Remote are all less than 5, which may affect the reliability of the test.

conducting chi-squared test between company_size and work_models

# Cross-tabulate company size against work model and test for independence
table_data <- table(cleaned_salaries$company_size, cleaned_salaries$work_models)
chisq_test <- chisq.test(table_data)
## Warning in chisq.test(table_data): Chi-squared approximation may be incorrect
cat("\n\n")
cat("Chi-Squared Test for Company Size and Work Models")
## Chi-Squared Test for Company Size and Work Models
print(chisq_test)
## 
##  Pearson's Chi-squared test
## 
## data:  table_data
## X-squared = 421.91, df = 4, p-value < 2.2e-16
# Expected cell counts under independence, shown to check for small counts
chisq_test$expected
##         
##              Hybrid    On-site    Remote
##   Large   2.3904689  191.23751  117.3720
##   Medium 37.2252114 2978.01691 1827.7579
##   Small   0.3843198   30.74558   18.8701

Null Hypothesis: There is no association between company_size and work model (they are independent). Alternate Hypothesis: There is an association between company_size and work model (they are dependent). Results: Since the p-value(<2.2e-16) is less than 0.05, we reject the null hypothesis. There is a statistically significant association between company_size and work model. However, we should be cautious about these results because, for a chi-squared test, the expected frequencies should be greater than 5. In this case, the expected counts for Hybrid: Large, small:hybrid are all less than 5, which may affect the reliability of the test.

conducting chi-squared test between experience level and work_models

# Cross-tabulate work model against experience level and test for independence
table_data <- table(cleaned_salaries$work_models, cleaned_salaries$experience_level)
chisq_test <- chisq.test(table_data)
## Warning in chisq.test(table_data): Chi-squared approximation may be incorrect
cat("\n\n")
cat("Chi-Squared Test for Experience Level and Work Models")
## Chi-Squared Test for Experience Level and Work Models
print(chisq_test)
## 
##  Pearson's Chi-squared test
## 
## data:  table_data
## X-squared = 104.43, df = 6, p-value < 2.2e-16
# Expected cell counts under independence, shown to check for small counts
chisq_test$expected
##          
##           Entry-level  Mid-level Senior-level Executive-level
##   Hybrid     2.774789   8.862414     26.81783        1.544965
##   On-site  221.983090 708.993082   2145.42659      123.597233
##   Remote   136.242121 435.144504   1316.75557       75.857802

Null Hypothesis: There is no association between work model and experience level (they are independent). Alternate Hypothesis: There is an association between work model and experience level (they are dependent). Results: Since the p-value(<2.2e-16) is less than 0.05, we reject the null hypothesis. There is a statistically significant association between work model and experience level. However, we should be cautious about these results because, for a chi-squared test, the expected frequencies should be greater than 5. In this case, the expected counts for Hybrid: Entry-level, and Hybrid:Executive-level are all less than 5, which may affect the reliability of the test.

conducting chi-squared test between experience level and employment type

# Cross-tabulate experience level against employment type and test for independence
table_data <- table(cleaned_salaries$experience_level, cleaned_salaries$employment_type)
chisq_test <- chisq.test(table_data)
## Warning in chisq.test(table_data): Chi-squared approximation may be incorrect
cat("\n\n")
cat("Chi-Squared Test for Experience Level and Employment Type")
## Chi-Squared Test for Experience Level and Employment Type
print(chisq_test)
## 
##  Pearson's Chi-squared test
## 
## data:  table_data
## X-squared = 50.918, df = 6, p-value = 3.076e-09
# Expected cell counts under independence, shown to check for small counts
chisq_test$expected
##                  
##                    Contract Full-time Part-time
##   Entry-level     0.6243274  360.1676 0.2081091
##   Mid-level       1.9940430 1150.3413 0.6646810
##   Senior-level    6.0340123 3480.9547 2.0113374
##   Executive-level 0.3476172  200.5365 0.1158724

Null Hypothesis: There is no association between experience level and employment type(they are independent). Alternate Hypothesis: There is an association between experience level and employment type(they are dependent). Results: Since the p-value(3.076e-09) is less than 0.05, we reject the null hypothesis. There is a statistically significant association between experience level and employment type. However, we should be cautious about these results because, for a chi-squared test, the expected frequencies should be greater than 5. In this case, the expected counts for Entry-level:contract, Executive-level:contract, Entry-level:Part-time, Executive-level and part-time, mid-level:contract, mid-level:part-time and senior-level:part-time are all less than 5, which may affect the reliability of the test.

6. Conclusion

📌 Key Takeaways:
Salaries have increased from 2020 to 2024, likely due to growing demand.
Experience level significantly impacts salary, with senior professionals earning much more.
Remote jobs offer higher salaries than on-site roles.
Larger companies tend to pay better than small organizations.
Statistical tests confirm significant relationships between salary, experience, work model, and company size.

References

  1. Kaggle: Data Science Salaries Dataset
  2. Tidyverse Package Documentation
  3. R Graphics Cookbook

7. Continuing Final Project – Data Preparation for Modeling (Yeobi’s Part)

This section finalizes the dataset for model training and evaluation.

Step 1: Re-create cleaned_salaries by removing outliers using IQR

# Outlier rule: keep only salaries lying within 1.5 interquartile ranges of the
# first and third quartiles.
Q1 <- quantile(us_residents_df$salary_in_usd, 0.25)
Q3 <- quantile(us_residents_df$salary_in_usd, 0.75)
# Stored as iqr_value rather than IQR so the name of stats::IQR() is not shadowed
iqr_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * iqr_value
upper_bound <- Q3 + 1.5 * iqr_value

# Logical mask of rows whose salary falls inside [lower_bound, upper_bound]
within_bounds <- us_residents_df$salary_in_usd >= lower_bound &
  us_residents_df$salary_in_usd <= upper_bound
cleaned_salaries <- us_residents_df[within_bounds, ]

Step 2: Convert categorical columns to factors

# Names of the columns that hold categories rather than numbers
cat_vars <- c(
  "job_title", "employment_type", "work_models",
  "company_size", "experience_level"
)
# Convert each listed column to a factor, one column at a time
cleaned_salaries[cat_vars] <- lapply(
  cat_vars,
  function(var_name) as.factor(cleaned_salaries[[var_name]])
)

Step 3: Split the data into 80/20 train-test sets

set.seed(123)  # fixed seed so the split is reproducible
n_rows <- nrow(cleaned_salaries)
# floor() makes the 80% training size an explicit whole number instead of
# relying on sample() to truncate a fractional size
train_index <- sample(seq_len(n_rows), size = floor(0.8 * n_rows))
train_df <- cleaned_salaries[train_index, ]
# setdiff() is used instead of -train_index because negative indexing with an
# empty index vector selects zero rows rather than all rows
test_df <- cleaned_salaries[setdiff(seq_len(n_rows), train_index), ]

Step 4: Ensure consistency in factor levels between training and test sets

# Re-declare every categorical test column with the level set of the matching
# training column, pairing the columns up by name.
test_df[cat_vars] <- Map(
  function(test_col, train_col) factor(test_col, levels = levels(train_col)),
  test_df[cat_vars],
  train_df[cat_vars]
)

Step 5: View structure of resulting datasets

# Inspect the column types and factor levels of the training split
str(train_df)
## 'data.frame':    4163 obs. of  8 variables:
##  $ job_title       : Factor w/ 104 levels "AI Architect",..: 36 36 59 79 53 59 79 92 9 36 ...
##  $ experience_level: Factor w/ 4 levels "Entry-level",..: 3 4 4 4 4 3 4 2 4 4 ...
##  $ employment_type : Factor w/ 3 levels "Contract","Full-time",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ work_models     : Factor w/ 3 levels "Hybrid","On-site",..: 2 2 2 2 2 2 3 2 2 3 ...
##  $ work_year       : int  2023 2023 2023 2023 2022 2023 2023 2023 2023 2021 ...
##  $ salary_in_usd   : int  95000 175000 194500 170000 122000 185000 205600 200000 246500 115000 ...
##  $ company_location: chr  "United States" "United States" "United States" "United States" ...
##  $ company_size    : Factor w/ 3 levels "Large","Medium",..: 2 2 2 2 2 2 2 2 2 3 ...
# Inspect the column types and factor levels of the test split
str(test_df)
## 'data.frame':    1041 obs. of  8 variables:
##  $ job_title       : Factor w/ 104 levels "AI Architect",..: 59 14 99 36 59 59 79 57 9 36 ...
##  $ experience_level: Factor w/ 4 levels "Entry-level",..: 4 3 1 3 3 3 4 4 3 3 ...
##  $ employment_type : Factor w/ 3 levels "Contract","Full-time",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ work_models     : Factor w/ 3 levels "Hybrid","On-site",..: 3 2 2 2 2 2 2 3 2 3 ...
##  $ work_year       : int  2024 2024 2024 2024 2024 2024 2024 2024 2024 2024 ...
##  $ salary_in_usd   : int  100022 62100 150000 145500 179000 112000 267900 90000 222200 129600 ...
##  $ company_location: chr  "United States" "United States" "United States" "United States" ...
##  $ company_size    : Factor w/ 3 levels "Large","Medium",..: 2 2 2 2 2 2 2 2 1 2 ...

8. MLP Regression Model (Yeobi)

This section builds a simple neural network model to predict salary using the cleaned dataset.

Step 1: Load required package

library(nnet)

Step 2: Normalize the target variable (salary_in_usd)

# Min-max rescale a numeric vector to the [0, 1] interval.
#
# Arguments:
#   x     - numeric vector to rescale.
#   na.rm - if TRUE, ignore NA values when computing the minimum and maximum;
#           NA entries stay NA in the result. Defaults to FALSE, in which case
#           any NA in x makes the whole result NA.
# Returns: a numeric vector the same length as x. When every value of x is
#   identical the range is zero, so zeros are returned instead of the NaN that
#   dividing by zero would give.
normalize <- function(x, na.rm = FALSE) {
  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]
  if (!is.na(span) && span == 0) {
    # Constant input: avoid 0 / 0
    return(x - rng[1])
  }
  (x - rng[1]) / span
}

# Scale both splits with the TRAINING minimum and maximum. Scaling the test set
# with its own min/max would put it on a different scale from the one the model
# is trained on and would use information from the test data.
train_salary_min <- min(train_df$salary_in_usd)
train_salary_max <- max(train_df$salary_in_usd)
train_df$salary_scaled <- normalize(train_df$salary_in_usd)
test_df$salary_scaled <- (test_df$salary_in_usd - train_salary_min) /
  (train_salary_max - train_salary_min)

Step 3: Train MLP regression model

# Model specification: scaled salary explained by the categorical job
# attributes plus the work year
salary_formula <- salary_scaled ~ job_title + employment_type + work_models +
  company_size + experience_level + work_year

# Seed set immediately before fitting so the run is reproducible
set.seed(123)
mlp_model <- nnet(
  salary_formula,
  data = train_df,
  size = 5,      # hidden-layer neurons
  linout = TRUE, # linear output unit, i.e. regression
  maxit = 500    # cap on training iterations
)
## # weights:  576
## initial  value 6633.611089 
## final  value 159.081094 
## converged

Step 4: Predict on test set

# Predictions are on the normalized salary scale, because the model was fit to
# salary_scaled rather than salary_in_usd
mlp_preds_scaled <- predict(mlp_model, newdata = test_df)

Step 5: Rescale predictions back to original salary scale

# The model was trained on salaries scaled with the TRAINING minimum and
# maximum, so its predictions must be mapped back to USD with those same
# training statistics. Using the test-set min/max applies a different scale and
# uses information from the test targets.
# NOTE: re-knit after this change; the RMSE reported below was produced with the
# previous (test-set) scaling.
salary_min <- min(train_df$salary_in_usd)
salary_max <- max(train_df$salary_in_usd)
mlp_preds <- mlp_preds_scaled * (salary_max - salary_min) + salary_min

Step 6: Evaluate model performance using RMSE

# Root-mean-square error between the rescaled predictions and the actual test
# salaries, in USD
rmse <- sqrt(mean((mlp_preds - test_df$salary_in_usd)^2))
cat("MLP Regression RMSE on test set:", round(rmse, 2), "\n")
## MLP Regression RMSE on test set: 57593.42